George Crowther
This is an extension and amalgamation of prior entries. The workflow is broadly similar to those completed previously.
In [1]:
import pandas as pd
import bokeh.plotting as bk
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier, TPOTRegressor
import sys
# sys.path does not expand '~', so use the absolute path to the local utilities
sys.path.append('/home/slygeorge/Documents/Python/SEG ML Competition')
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
bk.output_notebook()
In [2]:
# Input file paths
train_path = '../training_data.csv'
# Read training data to dataframe
train = pd.read_csv(train_path)
# TPOT library requires that the target class is renamed to 'class'
train.rename(columns={'Facies': 'class'}, inplace=True)
well_names = train['Well Name']
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']
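For reference, the 'class' codes are the integers 1 to 9; a quick sketch (assuming the usual SEG encoding, 1 = 'SS' through 9 = 'BS') maps them back to the label strings:
facies_map = {code: label for code, label in enumerate(facies_labels, start = 1)}
print(train['class'].map(facies_map).value_counts())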
In [3]:
train.head()
Out[3]:
In [4]:
train.dropna().describe()
Out[4]:
In [5]:
# Some quick-look plots. PE is highlighted, as it appears to be missing from
# the alternative version of the training dataset.
plots = []
for well, group in train.groupby('Well Name'):
    group = group.sort_values(by = 'Depth')
    plots.append(bk.figure(height = 500, width = 150))
    plots[-1].line(group['PE'], group['Depth'], color = 'blue')
    plots[-1].line(group['DeltaPHI'], group['Depth'], color = 'red')
    plots[-1].title.text = well
grid = bk.gridplot([plots])
bk.show(grid)
In [6]:
# Set string features to integers
for i, value in enumerate(train['Formation'].unique()):
    train.loc[train['Formation'] == value, 'Formation'] = i
for i, value in enumerate(train['Well Name'].unique()):
    train.loc[train['Well Name'] == value, 'Well Name'] = i
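As an aside, pd.factorize produces the same first-occurrence integer encoding in a single call; an equivalent sketch, left commented so the loop above remains the canonical version:
# train['Formation'] = pd.factorize(train['Formation'])[0]
# train['Well Name'] = pd.factorize(train['Well Name'])[0]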
In [7]:
# Used to reassign the index, initially added after attempting to upsample results
train['orig_index'] = train.index
In [8]:
# Define resample factors
resample_factors = [2, 5, 10, 25, 50]
initial_columns = ['Formation', 'Well Name', 'Depth', 'GR', 'ILD_log10',
'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
div_columns = ['Depth', 'GR', 'ILD_log10',
'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
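A centered rolling window of length f leaves NaNs at each end of a well; the interpolate/bfill/ffill calls in the next cell are there to clean those up. A minimal illustration on a toy series:
s = pd.Series([1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0])
print(s.rolling(window = 5, center = True).mean())  # NaN, NaN, 6.2, 12.4, 24.8, NaN, NaN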
In [9]:
# Use rolling windows through upsampled frame, grouping by well name.
# Empty list to hold frames
mean_frames = []
above = []
below = []
for well, group in train.groupby('Well Name'):
    # Empty list to hold rolling frames
    constructor_list = []
    for f in resample_factors:
        working_frame = group[['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']]
        mean_frame = working_frame.rolling(window = f, center = True).mean().interpolate(method = 'index', limit_direction = 'both', limit = None)
        mean_frame.columns = ['Mean_{0}_{1}'.format(f, column) for column in mean_frame.columns]
        max_frame = working_frame.rolling(window = f, center = True).max().interpolate(method = 'index', limit_direction = 'both', limit = None)
        max_frame.columns = ['Max_{0}_{1}'.format(f, column) for column in max_frame.columns]
        min_frame = working_frame.rolling(window = f, center = True).min().interpolate(method = 'index', limit_direction = 'both', limit = None)
        min_frame.columns = ['Min_{0}_{1}'.format(f, column) for column in min_frame.columns]
        std_frame = working_frame.rolling(window = f, center = True).std().interpolate(method = 'index', limit_direction = 'both', limit = None)
        std_frame.columns = ['Std_{0}_{1}'.format(f, column) for column in std_frame.columns]
        var_frame = working_frame.rolling(window = f, center = True).var().interpolate(method = 'index', limit_direction = 'both', limit = None)
        var_frame.columns = ['Var_{0}_{1}'.format(f, column) for column in var_frame.columns]
        diff_frame = working_frame.diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = None)
        diff_frame.columns = ['Diff_{0}_{1}'.format(f, column) for column in diff_frame.columns]
        rdiff_frame = working_frame.sort_index(ascending = False).diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = None).sort_index()
        rdiff_frame.columns = ['Rdiff_{0}_{1}'.format(f, column) for column in rdiff_frame.columns]
        skew_frame = working_frame.rolling(window = f, center = True).skew().interpolate(method = 'index', limit_direction = 'both', limit = None)
        skew_frame.columns = ['Skew_{0}_{1}'.format(f, column) for column in skew_frame.columns]
        # NB: skew_frame is computed but not included in the concatenation below
        f_frame = pd.concat((mean_frame, max_frame, min_frame, std_frame, var_frame, diff_frame, rdiff_frame), axis = 1)
        constructor_list.append(f_frame)
    well_frame = pd.concat(constructor_list, axis = 1)
    well_frame['class'] = group['class']
    well_frame['Well Name'] = well
    # orig_index holds the original index locations, making result extraction trivial
    well_frame['orig_index'] = group['orig_index']
    df = group.sort_values('Depth')
    u = df.shift(-1).fillna(method = 'ffill')
    b = df.shift(1).fillna(method = 'bfill')
    above.append(u[div_columns])
    below.append(b[div_columns])
    mean_frames.append(well_frame.fillna(method = 'bfill').fillna(method = 'ffill'))
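The above/below frames capture each sample's immediate neighbours along depth: with the group sorted by increasing depth, shift(-1) pulls in the next row's values and shift(1) the previous row's, with edge NaNs filled from the nearest neighbour. A toy check of that logic:
toy = pd.DataFrame({'Depth': [100.0, 100.5, 101.0], 'GR': [60.0, 75.0, 90.0]})
print(toy.shift(-1).fillna(method = 'ffill'))  # each row takes the next sample's values
print(toy.shift(1).fillna(method = 'bfill'))   # each row takes the previous sample's values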
In [10]:
# Concatenate all sub-frames together into a single 'upsampled_frame'
frame = train
frame.index = frame['orig_index']
frame.drop(['orig_index', 'class', 'Well Name'], axis = 1, inplace = True)
for f in mean_frames:
    f.index = f['orig_index']
rolling_frame = pd.concat(mean_frames, axis = 0)
above_frame = pd.concat(above)
above_frame.columns = ['above_'+ column for column in above_frame.columns]
below_frame = pd.concat(below)
below_frame.columns = ['below_'+ column for column in below_frame.columns]
upsampled_frame = pd.concat((frame, rolling_frame, above_frame, below_frame), axis = 1)
In [11]:
# Features is the column set used for training the model
features = [feature for feature in upsampled_frame.columns if 'class' not in feature]
In [12]:
# Normalise dataset
std_scaler = preprocessing.StandardScaler().fit(upsampled_frame[features])
train_std = std_scaler.transform(upsampled_frame[features])
train_std_frame = upsampled_frame
for i, column in enumerate(features):
    train_std_frame.loc[:, column] = train_std[:, i]
upsampled_frame_std = train_std_frame
In [13]:
# Create ratios between features
div_columns = ['Depth', 'GR', 'ILD_log10',
'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
for feature in div_columns:
    for f in div_columns:
        if f == feature:
            continue
        upsampled_frame['{0}_{1}'.format(feature, f)] = upsampled_frame[f] / upsampled_frame[feature]
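Because the inputs were standardised first, a denominator can be zero (or very close to it), so these ratio columns can contain inf values. A defensive clean-up sketch, not part of the original run:
# upsampled_frame.replace([np.inf, -np.inf], np.nan, inplace = True)
# upsampled_frame.fillna(method = 'bfill', inplace = True)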
In [14]:
features = [column for column in upsampled_frame.columns if 'class' not in column]
print(features)
In [15]:
train_f, test_f = train_test_split(upsampled_frame_std, test_size = 0.2,
                                   random_state = 72)
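This split is random across rows, so samples from the same well (and heavily overlapping rolling windows) land in both halves, which can flatter the score. A hedged alternative sketch that holds out whole wells instead; holdout_wells is a hypothetical choice:
# holdout_wells = [0, 1]  # hypothetical: integer well codes assigned in cell 6
# mask = upsampled_frame_std['Well Name'].isin(holdout_wells)
# train_f, test_f = upsampled_frame_std[~mask], upsampled_frame_std[mask]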
In [16]:
# --------------------------
# TPOT Generated Model
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier
exported_pipeline = make_pipeline(
    make_union(VotingClassifier([("est", ExtraTreesClassifier(criterion="entropy", max_features=0.36, n_estimators=500))]), FunctionTransformer(lambda X: X)),
    DecisionTreeClassifier()
)
exported_pipeline.fit(train_f[features], train_f['class'])
Out[16]:
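A note on the pipeline shape: make_union stacks the ExtraTrees predictions (VotingClassifier exposed a transform method in the scikit-learn releases TPOT targeted at the time) alongside the untouched inputs passed through the identity FunctionTransformer, so the final DecisionTreeClassifier trains on every original feature plus the ensemble's predicted class. A commented sanity check of that extra column:
# union = exported_pipeline.steps[0][1]
# print(train_f[features].shape, union.transform(train_f[features]).shape)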
In [17]:
exported_pipeline.score(test_f[features], test_f['class'])
Out[17]:
In [21]:
result = exported_pipeline.predict(test_f[features])
from sklearn.metrics import confusion_matrix
from classification_utilities import display_cm, display_adj_cm
conf = confusion_matrix(test_f['class'], result)
display_cm(conf, facies_labels, hide_zeros = True, display_metrics = True)
def accuracy(conf):
    total_correct = 0
    nb_classes = conf.shape[0]
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
    acc = total_correct / sum(sum(conf))
    return acc
print(accuracy(conf))
adjacent_facies = np.array([[1], [0, 2], [1], [4], [3, 5], [4, 6, 7], [5, 7], [5, 6, 8], [6, 7]])
def accuracy_adjacent(conf, adjacent_facies):
    nb_classes = conf.shape[0]
    total_correct = 0
    for i in np.arange(0, nb_classes):
        total_correct += conf[i][i]
        for j in adjacent_facies[i]:
            total_correct += conf[i][j]
    return total_correct / sum(sum(conf))
print(accuracy_adjacent(conf, adjacent_facies))
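A micro-example of the two metrics: with this 2x2 matrix and the two classes marked adjacent to each other, plain accuracy is (8 + 6) / 20 = 0.7, while adjacent accuracy also credits the off-diagonal cells, giving 20 / 20 = 1.0.
toy_conf = np.array([[8, 2], [4, 6]])
print(accuracy(toy_conf))                                 # 0.7
print(accuracy_adjacent(toy_conf, np.array([[1], [0]])))  # 1.0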
Now load and process the test data set, then predict using the 'exported_pipeline' model.
In [19]:
test_path = '../validation_data_nofacies.csv'
# Read test data to dataframe
test = pd.read_csv(test_path)
# Set string features to integers
for i, value in enumerate(test['Formation'].unique()):
    test.loc[test['Formation'] == value, 'Formation'] = i
for i, value in enumerate(test['Well Name'].unique()):
    test.loc[test['Well Name'] == value, 'Well Name'] = i
# The first step is to upsample and interpolate the test data, the objective
# being to mirror the feature engineering applied to the training set and to
# capture more of the sample interdependency.
upsampled_arrays = []
test['orig_index'] = test.index
# Use rolling windows through upsampled frame, grouping by well name.
# Empty list to hold frames
mean_frames = []
above = []
below = []
for well, group in test.groupby('Well Name'):
    # Empty list to hold rolling frames
    constructor_list = []
    for f in resample_factors:
        working_frame = group[['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']]
        mean_frame = working_frame.rolling(window = f, center = True).mean().interpolate(method = 'index', limit_direction = 'both', limit = None)
        mean_frame.columns = ['Mean_{0}_{1}'.format(f, column) for column in mean_frame.columns]
        max_frame = working_frame.rolling(window = f, center = True).max().interpolate(method = 'index', limit_direction = 'both', limit = None)
        max_frame.columns = ['Max_{0}_{1}'.format(f, column) for column in max_frame.columns]
        min_frame = working_frame.rolling(window = f, center = True).min().interpolate(method = 'index', limit_direction = 'both', limit = None)
        min_frame.columns = ['Min_{0}_{1}'.format(f, column) for column in min_frame.columns]
        std_frame = working_frame.rolling(window = f, center = True).std().interpolate(method = 'index', limit_direction = 'both', limit = None)
        std_frame.columns = ['Std_{0}_{1}'.format(f, column) for column in std_frame.columns]
        var_frame = working_frame.rolling(window = f, center = True).var().interpolate(method = 'index', limit_direction = 'both', limit = None)
        var_frame.columns = ['Var_{0}_{1}'.format(f, column) for column in var_frame.columns]
        diff_frame = working_frame.diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = None)
        diff_frame.columns = ['Diff_{0}_{1}'.format(f, column) for column in diff_frame.columns]
        rdiff_frame = working_frame.sort_index(ascending = False).diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = None).sort_index()
        rdiff_frame.columns = ['Rdiff_{0}_{1}'.format(f, column) for column in rdiff_frame.columns]
        skew_frame = working_frame.rolling(window = f, center = True).skew().interpolate(method = 'index', limit_direction = 'both', limit = None)
        skew_frame.columns = ['Skew_{0}_{1}'.format(f, column) for column in skew_frame.columns]
        # NB: as with the training set, skew_frame is computed but not included below
        f_frame = pd.concat((mean_frame, max_frame, min_frame, std_frame, var_frame, diff_frame, rdiff_frame), axis = 1)
        constructor_list.append(f_frame)
    well_frame = pd.concat(constructor_list, axis = 1)
    well_frame['Well Name'] = well
    # orig_index holds the original index locations, making result extraction trivial
    well_frame['orig_index'] = group['orig_index']
    df = group.sort_values('Depth')
    u = df.shift(-1).fillna(method = 'ffill')
    b = df.shift(1).fillna(method = 'bfill')
    above.append(u[div_columns])
    below.append(b[div_columns])
    mean_frames.append(well_frame.fillna(method = 'bfill').fillna(method = 'ffill'))
frame = test
frame.index = frame['orig_index']
frame.drop(['orig_index', 'Well Name'], axis = 1, inplace = True)
for f in mean_frames:
    f.index = f['orig_index']
rolling_frame = pd.concat(mean_frames, axis = 0)
above_frame = pd.concat(above)
above_frame.columns = ['above_'+ column for column in above_frame.columns]
below_frame = pd.concat(below)
below_frame.columns = ['below_'+ column for column in below_frame.columns]
upsampled_frame = pd.concat((frame, rolling_frame, above_frame, below_frame), axis = 1)
features = [feature for feature in upsampled_frame.columns if 'class' not in feature]
std_scaler = preprocessing.StandardScaler().fit(upsampled_frame[features])
train_std = std_scaler.transform(upsampled_frame[features])
train_std_frame = upsampled_frame
for i, column in enumerate(features):
    train_std_frame.loc[:, column] = train_std[:, i]
upsampled_frame_std = train_std_frame
for feature in div_columns:
    for f in div_columns:
        if f == feature:
            continue
        upsampled_frame['{0}_{1}'.format(feature, f)] = upsampled_frame[f] / upsampled_frame[feature]
features = [feature for feature in upsampled_frame.columns if 'class' not in feature]
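Note that the StandardScaler above is refit on the test wells rather than reusing the training-set fit. A stricter sketch, assuming the scaler from cell 12 had been kept under a separate (hypothetical) name and the column order matches:
# test_std = training_std_scaler.transform(upsampled_frame[features])  # hypothetical name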
In [20]:
# Predict result on full sample set
result = exported_pipeline.predict(upsampled_frame[features])
# Add result to test set
upsampled_frame['Facies'] = result
# Output to csv
upsampled_frame.to_csv('05 - Well Facies Prediction - Test Data Set.csv')
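Optionally, the integer predictions can be mapped back to facies names before writing, assuming the same 1-9 encoding as the training labels; a commented sketch:
# upsampled_frame['FaciesLabel'] = upsampled_frame['Facies'].map(
#     {code: label for code, label in enumerate(facies_labels, start = 1)})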